/*    Copyright 2010 Tobias Marschall
 *
 *    This file is part of MoSDi.
 *
 *    MoSDi is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    MoSDi is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with MoSDi.  If not, see <http://www.gnu.org/licenses/>.
 */

package mosdi.subcommands;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import mosdi.fa.Alphabet;
import mosdi.fa.CDFA;
import mosdi.fa.DFAFactory;
import mosdi.fa.FiniteMemoryTextModel;
import mosdi.fa.GeneralizedString;
import mosdi.fa.IIDTextModel;
import mosdi.paa.ClumpSizeCalculator;
import mosdi.util.Iupac;
import mosdi.util.Log;
import mosdi.util.NamedSequence;
import mosdi.util.SequenceUtils;

/**
 * Subcommand that estimates clump statistics for an IUPAC pattern by
 * simulation: random texts are generated from a finite-memory text model and
 * scanned with a (minimized) counting DFA, recording the empirical match
 * count distribution, clump size distribution, and the distribution of DFA
 * states at which clumps start. With -C, exactly computed clump statistics
 * are printed in addition for comparison.
 */
public class EmpiricClumpStatSubcommand extends Subcommand {
	
	@Override
	public String usage() {
		return
		super.usage()+" [options] <iupac-pattern>\n" +
		"\n" +
		"Options:\n" +
		"  -M <order>: Order of the Markov model. Default: 0 (i.i.d.)\n" +
		"  -r: simultaneously consider reverse complementary motif\n" +
		"  -t <fasta-file>: Estimate distribution from given sequence(s).\n" +
		"  -i <iterations>: number of sequences to be generated (default: 10000)\n"+
		"  -l <length>: length of sequences (default: 100)\n" +
		"  -n <max-matches>: maximum number of matches in distribution (default: 15)\n" +
		"  -c <max-clump-size>: size of clump size distribution (default: 15)\n" +
		"  -C: additionally compute exact clump statistics";
	}
	
	@Override
	public String description() {
		return "Obtains clump statistics by simulation.";
	}

	@Override
	public String name() {
		return "empiric-clump-stat";
	}

	/**
	 * Reads the sequences used for text model estimation, or returns null when
	 * no FASTA filename was given (-t absent). Terminates the process with
	 * exit status 1 if the file cannot be read.
	 */
	private List<NamedSequence> readModelEstimationSequences(String fastaFilename, Alphabet dnaAlphabet) {
		if (fastaFilename==null) return null;
		try {
			return SequenceUtils.readFastaFile(fastaFilename, dnaAlphabet, true);
		} catch (Exception e) {
			Log.errorln("Error reading "+fastaFilename+":");
			Log.errorln(e.getMessage());
			System.exit(1);
			return null; // unreachable: System.exit() does not return
		}
	}

	/**
	 * Builds the text model: a uniform i.i.d. model when no estimation
	 * sequences are available (only valid for order 0; otherwise the process
	 * exits with status 1), or a model of the requested Markov order estimated
	 * from the given sequences.
	 */
	private FiniteMemoryTextModel createTextModel(List<NamedSequence> modelEstimationSequences, int alphabetSize, int textModelOrder) {
		if (modelEstimationSequences==null) {
			if (textModelOrder>0) {
				// -M > 0 needs training data; impliedOptions("M","t") should already
				// have enforced this, so this is a safety net.
				Log.errorln("Error: Text model undefined.");
				System.exit(1);
			}
			return new IIDTextModel(alphabetSize);
		}
		return SequenceUtils.buildTextModelFromNamedSequences(modelEstimationSequences, alphabetSize, textModelOrder);
	}

	/**
	 * Builds a minimized counting DFA that recognizes the given pattern and,
	 * if requested, also its reverse complement. Logs state counts before and
	 * after Hopcroft minimization.
	 */
	private CDFA createCdfa(Alphabet dnaAlphabet, String pattern, boolean considerReverse) {
		List<GeneralizedString> genStringList = new ArrayList<GeneralizedString>(considerReverse?2:1);
		genStringList.add(Iupac.toGeneralizedString(pattern));
		if (considerReverse) {
			String reversePattern = Iupac.reverseComplementary(pattern);
			genStringList.add(Iupac.toGeneralizedString(reversePattern));
		}
		CDFA cdfa = DFAFactory.build(dnaAlphabet, genStringList, 50000);
		Log.printf(Log.Level.STANDARD, "DFA states: %d%n", cdfa.getStateCount());
		cdfa = cdfa.minimizeHopcroft();
		Log.printf(Log.Level.STANDARD, "DFA states (minimized): %d%n", cdfa.getStateCount());
		return cdfa;
	}

	@Override
	public int run(String[] args) {
		parseOptions(args, 1, "t:M:i:l:n:rc:C");

		// Option dependencies: a Markov order (-M) requires training sequences (-t).
		impliedOptions("M", "t");

		// Mandatory arguments
		String pattern = getStringArgument(0);

		// Options
		int textModelOrder = getNonNegativeIntOption("M", 0);
		int iterations = getPositiveIntOption("i", 10000);
		int sequenceLength = getPositiveIntOption("l", 100);
		// maximal number of matches to be counted (size of distribution array)
		int maxMatches = getNonNegativeIntOption("n", 15);
		boolean considerReverse = getBooleanOption("r", false);
		int maxClumpSize = getPositiveIntOption("c", 15);
		boolean exactClumpStatistics = getBooleanOption("C", false);
		String fastaFilename = getStringOption("t", null);

		Alphabet dnaAlphabet = Alphabet.getDnaAlphabet();
		List<NamedSequence> modelEstimationSequences = readModelEstimationSequences(fastaFilename, dnaAlphabet);

		if (pattern.length()<=1) {
			Log.errorln("Error: Pattern too short.");
			System.exit(1);
		}

		FiniteMemoryTextModel textModel = createTextModel(modelEstimationSequences, dnaAlphabet.size(), textModelOrder);
		CDFA cdfa = createCdfa(dnaAlphabet, pattern, considerReverse);

		// Simulation: scan each random text with the DFA, collecting match
		// counts, clump sizes, and the DFA state at which each clump starts.
		int totalClumps = 0;
		int[] clumpSizeFrequencies = new int[maxClumpSize+1];
		int totalMatches = 0;
		int[] matchCountFrequencies = new int[maxMatches+1];
		int[] clumpStartStateFrequencies = new int[cdfa.getStateCount()];
		for (int i=0; i<iterations; ++i) {
			int[] s = textModel.generateRandomText(sequenceLength);
			int state = cdfa.getStartState();
			int lastMatchPos = -1;
			int currentClumpSize = -1; // -1 means "not currently inside a clump"
			int matches = 0;
			for (int pos=0; pos<sequenceLength; ++pos) {
				state = cdfa.getTransitionTarget(state, s[pos]);
				int stateOutput = cdfa.getStateOutput(state); 
				if (stateOutput>0) {
					lastMatchPos = pos;
					matches+=stateOutput;
					if (currentClumpSize==-1) {
						// first match of a new clump: remember the start state
						currentClumpSize = stateOutput;
						clumpStartStateFrequencies[state]+=1;
					} else {
						currentClumpSize+=stateOutput;
					}
				} else {
					// Close the clump once pos is pattern.length()-1 positions past the
					// last match end: any later match starts after that match ended,
					// i.e. cannot overlap it anymore.
					if ((currentClumpSize!=-1) && (lastMatchPos==pos-pattern.length()+1)) {
						if (currentClumpSize<=maxClumpSize)	clumpSizeFrequencies[currentClumpSize]+=1;	
						totalClumps+=1;
						currentClumpSize=-1;
					}
				}
			}
			// a clump still open at the end of the sequence is counted as well
			if (currentClumpSize!=-1) {
				if (currentClumpSize<=maxClumpSize)	clumpSizeFrequencies[currentClumpSize]+=1;	
				totalClumps+=1;
			}
			matchCountFrequencies[Math.min(matches,maxMatches)]+=1;
			totalMatches+=matches;
		}

		// Normalize frequencies to empirical distributions.
		// NOTE(review): if no clump was observed at all (totalClumps==0), the
		// divisions below yield NaN and the printed distributions reflect that.
		double[] empiricClumpSizeDist = new double[maxClumpSize+1];
		for (int i=0; i<=maxClumpSize; ++i) empiricClumpSizeDist[i]=((double)clumpSizeFrequencies[i])/totalClumps;
		double[] matchCountDist = new double[maxMatches+1];
		for (int i=0; i<=maxMatches; ++i) matchCountDist[i]=((double)matchCountFrequencies[i])/iterations;
		double[] clumpStartStateDist = new double[cdfa.getStateCount()];
		for (int i=0; i<cdfa.getStateCount(); ++i) clumpStartStateDist[i]=((double)clumpStartStateFrequencies[i])/totalClumps;
		// Fixed: use %d for the int 'iterations' (was %s) for consistency.
		Log.printf(Log.Level.STANDARD, "Seen %d matches in %d clumps in %d sequences (avg. clump size: %f)%n", totalMatches, totalClumps, iterations, ((double)totalMatches)/totalClumps);
		Log.printf(Log.Level.STANDARD, "Empiric clump size frequencies: %s%n", Arrays.toString(clumpSizeFrequencies));
		Log.printf(Log.Level.STANDARD, "Empiric match count frequencies: %s%n", Arrays.toString(matchCountFrequencies));
		Log.printf(Log.Level.STANDARD, "Empiric clump start state frequencies: %s%n", Arrays.toString(clumpStartStateFrequencies));
		Log.println(Log.Level.STANDARD, "");
		Log.printf(Log.Level.STANDARD, "Empiric clump size distribution: %s%n", Arrays.toString(empiricClumpSizeDist));
		Log.printf(Log.Level.STANDARD, "Empiric match count distribution: %s%n", Arrays.toString(matchCountDist));
		Log.printf(Log.Level.STANDARD, "Empiric clump start state distribution: %s%n", Arrays.toString(clumpStartStateDist));

		if (exactClumpStatistics) {
			Log.println(Log.Level.STANDARD, "");
			
			ClumpSizeCalculator csc = new ClumpSizeCalculator(textModel, cdfa, pattern.length());
			Log.printf(Log.Level.STANDARD, "States in product automaton: %d%n", csc.getProductStateCount());

			// calculate clump size distribution
			double[] initialStateDistribution = csc.computeClumpStartDistribution();
			double[] exactClumpSizeDist = csc.clumpSizeDistribution(maxClumpSize, 1e-50);
			
			// calculate expected clump size (index 0 contributes nothing)
			double exactExpectedClumpSize = 0.0;
			for (int i=1; i<exactClumpSizeDist.length; ++i) {
				exactExpectedClumpSize+=exactClumpSizeDist[i]*i;
			}

			Log.printf(Log.Level.STANDARD, "Exact clump size distribution: %s%n", Arrays.toString(exactClumpSizeDist));
			Log.printf(Log.Level.STANDARD, "Exact clump start state distribution: %s%n", Arrays.toString(initialStateDistribution));
			Log.printf(Log.Level.STANDARD, "Expected clump size: %e%n", exactExpectedClumpSize);
		}
		return 0;
	}
}
